library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.1.2
## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.1.2
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(clValid) # For selecting cluster method and number of clusters
## Loading required package: cluster
library(factoextra) # for cluster fitting and visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(uwot) # For UMAP dimensionality reduction 
## Warning: package 'uwot' was built under R version 4.1.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(patchwork) # For arranging multiple plots
## Warning: package 'patchwork' was built under R version 4.1.2
library(dplyr)
library(skimr)
## Warning: package 'skimr' was built under R version 4.1.2
## Clear the workspace FIRST, then seed the RNG. (rm(list = ls()) only removes
## visible, non-dot objects, so .Random.seed would survive either way, but
## seeding after the cleanup makes the intent unambiguous and stays correct if
## the cleanup is ever changed to ls(all.names = TRUE).)
## NOTE(review): rm(list = ls()) inside a script is generally discouraged —
## prefer running the analysis in a fresh R session.
rm(list = ls()) # Removes all visible variables
set.seed(888) # To ensure consistent results from non-deterministic procedures

## Load the raw ProPublica COMPAS scores directly from GitHub (11,757 rows)
compas.df = read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores.csv")
## New names:
## Rows: 11757 Columns: 47
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (19): name, first, last, sex, age_cat, race, c_case_number, c_charge_de... dbl
## (14): id, age, juv_fel_count, decile_score...12, juv_misd_count, juv_ot... lgl
## (2): num_r_cases, num_vr_cases dttm (2): c_jail_in, c_jail_out date (10):
## compas_screening_date, dob, c_offense_date, c_arrest_date, r_offe...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `decile_score` -> `decile_score...12`
## • `decile_score` -> `decile_score...45`
# Sanitize column names: make.names() yields syntactically valid, unique names
# with no spaces (the duplicated `decile_score` columns become ...12 / ...45).
colnames(compas.df) <- make.names(colnames(compas.df), unique = TRUE)

# Overview of every column — types, completeness, distributions — to spot missing data
compas.df %>% skimr::skim()
Data summary
Name Piped data
Number of rows 11757
Number of columns 47
_______________________
Column type frequency:
character 19
Date 10
logical 2
numeric 14
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1.00 6 29 0 11584 0
first 0 1.00 2 13 0 4058 0
last 0 1.00 1 20 0 5921 0
sex 0 1.00 4 6 0 2 0
age_cat 0 1.00 7 15 0 3 0
race 0 1.00 5 16 0 6 0
c_case_number 742 0.94 13 13 0 11015 0
c_charge_degree 0 1.00 1 1 0 3 0
c_charge_desc 749 0.94 5 52 0 531 0
r_case_number 8054 0.31 13 13 0 3703 0
r_charge_degree 0 1.00 1 1 0 3 0
r_charge_desc 8114 0.31 6 52 0 352 0
vr_case_number 10875 0.08 13 13 0 882 0
vr_charge_degree 10875 0.08 4 5 0 9 0
vr_charge_desc 10875 0.08 7 37 0 88 0
v_type_of_assessment 0 1.00 16 16 0 1 0
v_score_text 0 1.00 3 6 0 4 0
type_of_assessment 0 1.00 18 18 0 1 0
score_text 0 1.00 3 6 0 4 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
compas_screening_date 0 1.00 2013-01-01 2014-12-31 2013-12-12 704
dob 0 1.00 1919-10-14 1998-03-29 1983-11-14 7800
c_offense_date 2600 0.78 1987-11-07 2014-12-30 2013-11-25 1036
c_arrest_date 9899 0.16 1997-06-18 2014-12-31 2013-09-18 802
r_offense_date 8054 0.31 2013-01-03 2016-03-29 2014-08-27 1090
r_jail_in 9297 0.21 2013-01-04 2016-03-29 2014-08-21 984
r_jail_out 9297 0.21 2013-01-05 2020-01-01 2014-10-10 953
vr_offense_date 10875 0.08 2013-01-28 2016-03-13 2014-11-19 599
v_screening_date 0 1.00 2013-01-01 2014-12-31 2013-12-12 704
screening_date 0 1.00 2013-01-01 2014-12-31 2013-12-12 704

Variable type: logical

skim_variable n_missing complete_rate mean count
num_r_cases 11757 0 NaN :
num_vr_cases 11757 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 5879.00 3394.10 1 2940 5879 8818 11757 ▇▇▇▇▇
age 0 1.00 35.14 12.02 18 25 32 43 96 ▇▅▂▁▁
juv_fel_count 0 1.00 0.06 0.45 0 0 0 0 20 ▇▁▁▁▁
decile_score…12 0 1.00 4.37 2.88 -1 2 4 7 10 ▇▇▆▆▆
juv_misd_count 0 1.00 0.08 0.45 0 0 0 0 13 ▇▁▁▁▁
juv_other_count 0 1.00 0.09 0.47 0 0 0 0 17 ▇▁▁▁▁
priors_count 0 1.00 3.08 4.69 0 0 1 4 43 ▇▁▁▁▁
days_b_screening_arrest 1180 0.90 -0.88 72.89 -597 -1 -1 -1 1057 ▁▇▁▁▁
c_days_from_compas 742 0.94 63.59 341.90 0 1 1 2 9485 ▇▁▁▁▁
is_recid 0 1.00 0.25 0.56 -1 0 0 1 1 ▁▁▇▁▅
r_days_from_arrest 9297 0.21 20.41 74.35 -1 0 0 1 993 ▇▁▁▁▁
is_violent_recid 0 1.00 0.08 0.26 0 0 0 0 1 ▇▁▁▁▁
v_decile_score 0 1.00 3.57 2.50 -1 1 3 5 10 ▇▇▆▃▂
decile_score…45 0 1.00 4.37 2.88 -1 2 4 7 10 ▇▇▆▆▆

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
c_jail_in 1180 0.9 2013-01-01 01:31:55 2016-03-11 10:26:16 2013-12-12 03:14:03 10577
c_jail_out 1180 0.9 2013-01-02 01:12:01 2020-01-01 00:00:00 2014-01-05 08:05:17 10517
## Reduce to the four analysis variables — two numeric (priors_count, age) and
## two categorical (race, sex) — then encode the categoricals as numeric codes
## so they can enter the distance-based clustering.
## NOTE(review): as.numeric(as.factor(x)) assigns arbitrary integer codes in
## alphabetical level order; treating a nominal variable such as race as an
## ordered numeric distorts Euclidean distances — Gower distance or one-hot
## encoding would be sounder. Kept numeric here to match the original analysis.
compas.df <- compas.df %>%
  select(priors_count, age, race, sex) %>%
  mutate(across(c(sex, race), ~ as.numeric(as.factor(.x))))

## Prepare the data: drop any row with missing values, then z-score every
## column so all four variables contribute on a comparable scale.
scaled.compas.df <- compas.df %>%
  drop_na() %>%
  scale() %>%
  as.data.frame()

## Verify the result: every column complete, mean 0 and sd 1
skim(scaled.compas.df)
Data summary
Name scaled.compas.df
Number of rows 11757
Number of columns 4
_______________________
Column type frequency:
numeric 4
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
priors_count 0 1 0 1 -0.66 -0.66 -0.44 0.20 8.52 ▇▁▁▁▁
age 0 1 0 1 -1.43 -0.84 -0.26 0.65 5.06 ▇▅▂▁▁
race 0 1 0 1 -0.89 -0.89 0.50 0.50 2.59 ▇▆▂▁▁
sex 0 1 0 1 -1.96 0.51 0.51 0.51 0.51 ▂▁▁▁▇
## Take a 400-row random subsample so the exploratory validation runs quickly
sample.scaled.compas.df <- scaled.compas.df %>% sample_n(size = 400)

## Internal validation: score four algorithms over k = 2..10 on connectivity,
## the Dunn index, and average silhouette width
internal.cl <- clValid(
  sample.scaled.compas.df,
  nClust = 2:10,
  clMethods = c("kmeans", "pam", "agnes", "diana"),
  maxitems = 1000, # cap on the number of cases clValid will accept
  validation = "internal"
)
## Warning in clValid(sample.scaled.compas.df, nClust = 2:10, clMethods =
## c("kmeans", : rownames for data not specified, using 1:nrow(data)
## View internal metrics (connectivity / Dunn / silhouette per method and k)
summary(internal.cl)
## 
## Clustering Methods:
##  kmeans pam agnes diana 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 
## 
## Validation Measures:
##                             2        3        4        5        6        7        8        9       10
##                                                                                                      
## kmeans Connectivity   57.3837  84.6595  87.9552  67.4298  83.4560  84.0433  99.3833 105.8353 103.4929
##        Dunn            0.0268   0.0143   0.0173   0.0180   0.0389   0.0407   0.0433   0.0433   0.0433
##        Silhouette      0.2095   0.2567   0.2833   0.3768   0.3467   0.3441   0.3439   0.3473   0.3977
## pam    Connectivity   30.3782  20.6306  58.6563  84.6000  87.2790  85.6048  81.5952  93.1036  94.0627
##        Dunn            0.0119   0.0127   0.0122   0.0149   0.0149   0.0155   0.0155   0.0210   0.0210
##        Silhouette      0.2344   0.3184   0.3493   0.3550   0.3573   0.3682   0.3875   0.4057   0.4001
## agnes  Connectivity    4.3004   4.4254   8.4794  14.5385  20.4476  21.3698  48.6893  51.4333  54.7651
##        Dunn            0.1448   0.1593   0.1622   0.1622   0.1622   0.1622   0.0402   0.0402   0.0402
##        Silhouette      0.4798   0.3793   0.3404   0.3284   0.3151   0.2894   0.3313   0.3435   0.3377
## diana  Connectivity    6.0591  27.5853  31.8857  72.9746  79.6798  79.8048  87.9885  91.1353  92.4492
##        Dunn            0.1207   0.0496   0.0519   0.0172   0.0193   0.0199   0.0200   0.0222   0.0238
##        Silhouette      0.3518   0.3646   0.3336   0.3540   0.3550   0.3464   0.3528   0.3909   0.3908
## 
## Optimal Scores:
## 
##              Score  Method Clusters
## Connectivity 4.3004 agnes  2       
## Dunn         0.1622 agnes  4       
## Silhouette   0.4798 agnes  2
## Plot the internal validation profiles across cluster counts
plot(internal.cl)

# Agnes performs best with 2 clusters for silhouette and connectivity; for the
# Dunn index the optimum is agnes with 4 clusters, but 2 is a close second, so
# agnes with 2 clusters is the best overall solution.
## Stability metrics: how much assignments change when one column is removed
## at a time (APN, AD, ADM, FOM — lower is better)
stability.cl = clValid(sample.scaled.compas.df, 
                nClust = 2:10, 
                clMethods = c("kmeans","pam", "agnes", "diana"),
                maxitems = 1700, # specifies the number of cases considered
                validation = "stability")
## Warning in clValid(sample.scaled.compas.df, nClust = 2:10, clMethods =
## c("kmeans", : rownames for data not specified, using 1:nrow(data)
## View stability metrics (APN, AD, ADM, FOM — lower values mean more stable clusterings)
summary(stability.cl)
## 
## Clustering Methods:
##  kmeans pam agnes diana 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 
## 
## Validation Measures:
##                  2      3      4      5      6      7      8      9     10
##                                                                           
## kmeans APN  0.1397 0.3167 0.3037 0.1710 0.2563 0.3584 0.4357 0.4519 0.3543
##        AD   2.3837 2.2975 2.1864 1.8086 1.8031 1.7586 1.7055 1.6832 1.5485
##        ADM  0.4452 1.0306 0.9894 0.7283 0.8701 0.8820 0.9905 0.9953 0.8967
##        FOM  0.9763 0.9796 0.9812 0.9819 0.9803 0.9741 0.9690 0.9697 0.9716
## pam    APN  0.2269 0.2531 0.2573 0.2586 0.2936 0.2947 0.3510 0.3930 0.3930
##        AD   2.3938 2.0300 1.8512 1.7360 1.6902 1.6202 1.5517 1.4882 1.4502
##        ADM  0.7922 0.7061 0.7571 0.7824 0.8483 0.8711 0.8361 0.8083 0.8035
##        FOM  0.9844 0.9802 0.9732 0.9683 0.9681 0.9667 0.9607 0.9612 0.9591
## agnes  APN  0.0474 0.0732 0.0415 0.0431 0.1180 0.2769 0.3228 0.3220 0.3953
##        AD   2.4495 2.1531 2.0504 2.0201 1.9963 1.9903 1.8350 1.7992 1.7804
##        ADM  0.1696 0.3678 0.3681 0.3715 0.4698 0.7340 0.8465 0.8614 0.9076
##        FOM  0.9830 0.9821 0.9825 0.9829 0.9838 0.9823 0.9790 0.9797 0.9763
## diana  APN  0.3381 0.3049 0.4776 0.3360 0.3479 0.3631 0.3411 0.2626 0.2819
##        AD   2.5044 2.3795 2.3571 2.0782 1.9265 1.9066 1.8427 1.6614 1.5832
##        ADM  1.0954 1.0890 1.2699 1.0594 1.0007 1.0077 0.9751 0.8941 0.8414
##        FOM  0.9784 0.9765 0.9746 0.9748 0.9728 0.9737 0.9746 0.9716 0.9718
## 
## Optimal Scores:
## 
##     Score  Method Clusters
## APN 0.0415 agnes  4       
## AD  1.4502 pam    10      
## ADM 0.1696 agnes  2       
## FOM 0.9591 pam    10
plot(stability.cl)

## Agglomerative hierarchical clustering (agnes) via eclust. No k is supplied,
## so eclust chooses the number of clusters itself using nboot bootstrap samples.
compas.agnes = eclust(sample.scaled.compas.df, 
                    FUNcluster = "agnes", 
                    nboot = 200,
                    seed = 888)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
# Silhouette plot: average silhouette width per cluster of the automatic solution
fviz_silhouette(compas.agnes)
##   cluster size ave.sil.width
## 1       1   34          0.39
## 2       2   37          0.24
## 3       3   53          0.08
## 4       4   89          0.61
## 5       5   79          0.39
## 6       6   44          0.26
## 7       7   37          0.56
## 8       8   18          0.41
## 9       9    9          0.58

# Cluster 4 had the best fit in the automatic solution above. Here we refit
# agnes with the k = 2 solution favoured by the internal/stability validation.
compas.agnes <- eclust(
  sample.scaled.compas.df,
  FUNcluster = "agnes",
  k = 2,
  hc_metric = "euclidean", # distance metric
  hc_method = "ward.D2",   # agglomeration method
  seed = 888
)

# Average silhouette width for each of the two clusters
fviz_silhouette(compas.agnes)
##   cluster size ave.sil.width
## 1       1   78          0.42
## 2       2  322          0.35

# Dendrogram plot of the hierarchical merge structure
fviz_dend(compas.agnes) 

# Plot cluster membership in PCA space (first two principal components)
fviz_cluster(compas.agnes)

## Project the scaled sample into two UMAP dimensions, then standardize the embedding
umap.df <- umap(sample.scaled.compas.df, n_neighbors = 50, n_components = 2) %>% scale()
colnames(umap.df) <- c("umap1", "umap2")
umap.df <- as.data.frame(umap.df)

## Scatter plot of the 2-D embedding
umap.plot <- ggplot(umap.df, aes(umap1, umap2)) +
  geom_point(size = 0.5) +
  labs(title = "UMAP-transformed data") +
  theme_bw()
umap.plot

## Attach the embedding columns to the sample so later plots can use them as axes
sample.scaled.compas.df <- cbind(sample.scaled.compas.df, umap.df)

## More components capture more information.
## FIX: sample.scaled.compas.df was augmented above (L328) with the 2-D
## embedding columns umap1/umap2; feeding those back into UMAP would leak the
## first embedding into the second. Restrict to the original scaled features.
umap4.df = umap(select(sample.scaled.compas.df, -umap1, -umap2),
                n_neighbors = 50, n_components = 4) %>% scale()

## Re-run internal cluster validation on the 4-component UMAP embedding,
## now exploring a wider range of cluster counts (k = 2..15)
internal.cl = clValid(umap4.df, 
                  nClust = 2:15, 
                  clMethods = c("kmeans", "pam", "agnes", "diana"),
                  maxitems = 1700,
                  validation = "internal")
## Warning in clValid(umap4.df, nClust = 2:15, clMethods = c("kmeans", "pam", :
## rownames for data not specified, using 1:nrow(data)
## View internal metrics   
summary(internal.cl)
## 
## Clustering Methods:
##  kmeans pam agnes diana 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 11 12 13 14 15 
## 
## Validation Measures:
##                            2       3       4       5       6       7       8       9      10      11      12      13      14      15
##                                                                                                                                     
## kmeans Connectivity   0.0000  0.0000  7.9841 11.3762 12.4944 19.5901 26.2103 28.6071 48.1460 49.2373 52.4012 54.9413 63.3437 62.0829
##        Dunn           0.7424  0.6979  0.0154  0.0249  0.0789  0.0463  0.0414  0.0481  0.0649  0.0307  0.0382  0.0384  0.0391  0.0391
##        Silhouette     0.5927  0.7106  0.6963  0.6860  0.6749  0.6509  0.6356  0.6324  0.5854  0.5849  0.5428  0.5549  0.5364  0.5530
## pam    Connectivity   0.0000  0.0000  3.3921  9.0417 12.4944 19.1762 22.3401 29.7647 45.5619 58.5857 67.5488 68.2972 61.0607 64.2091
##        Dunn           0.7424  0.6979  0.0406  0.0454  0.0789  0.0492  0.0492  0.0338  0.0353  0.0390  0.0437  0.0483  0.0501  0.0509
##        Silhouette     0.5927  0.7106  0.6661  0.6878  0.6749  0.6522  0.6100  0.5948  0.5412  0.5417  0.5343  0.5452  0.5666  0.5527
## agnes  Connectivity   0.0000  0.0000  3.8913  7.2833 11.1496 12.3730 15.2131 20.4988 26.5647 34.6917 35.7012 36.4595 40.5647 45.5567
##        Dunn           0.7424  0.6979  0.0312  0.0327  0.0546  0.0546  0.0737  0.0743  0.0809  0.0933  0.0933  0.0933  0.1128  0.0799
##        Silhouette     0.5927  0.7106  0.6734  0.6635  0.6686  0.6433  0.6247  0.6215  0.5594  0.5736  0.5302  0.5408  0.5284  0.5374
## diana  Connectivity   0.0000  0.0000  7.9841 11.3762 24.2675 25.9675 32.2944 44.3496 48.9571 52.1210 60.0000 68.7956 72.8397 78.8056
##        Dunn           0.7424  0.6979  0.0154  0.0249  0.0270  0.0286  0.0367  0.0417  0.0481  0.0506  0.0518  0.0485  0.0520  0.0538
##        Silhouette     0.5927  0.7106  0.6963  0.6860  0.6274  0.6391  0.6151  0.5676  0.5624  0.5202  0.5179  0.5093  0.4926  0.4786
## 
## Optimal Scores:
## 
##              Score  Method Clusters
## Connectivity 0.0000 kmeans 2       
## Dunn         0.7424 kmeans 2       
## Silhouette   0.7106 kmeans 3
plot(internal.cl)

## Stability validation on the 4-component UMAP embedding (k = 2..15)
stability.cl = clValid(umap4.df, 
                  nClust = 2:15, 
                  clMethods = c("kmeans", "pam", "agnes", "diana"),
                  maxitems = 1700,
                  validation = "stability")
## Warning in clValid(umap4.df, nClust = 2:15, clMethods = c("kmeans", "pam", :
## rownames for data not specified, using 1:nrow(data)
## View stability metrics
summary(stability.cl)
## 
## Clustering Methods:
##  kmeans pam agnes diana 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 11 12 13 14 15 
## 
## Validation Measures:
##                  2      3      4      5      6      7      8      9     10     11     12     13     14     15
##                                                                                                              
## kmeans APN  0.0000 0.0419 0.1029 0.0879 0.0591 0.0820 0.0768 0.0910 0.1437 0.1334 0.1213 0.1184 0.1644 0.1340
##        AD   1.5863 0.8968 0.7446 0.5759 0.4085 0.3732 0.3396 0.3228 0.3137 0.2902 0.2568 0.2417 0.2309 0.2139
##        ADM  0.0000 0.1555 0.2777 0.2213 0.0760 0.0905 0.0768 0.0730 0.1103 0.0941 0.0863 0.0830 0.0893 0.0739
##        FOM  0.6582 0.4361 0.3950 0.3457 0.1609 0.1574 0.1489 0.1470 0.1462 0.1398 0.1319 0.1276 0.1121 0.1061
## pam    APN  0.3018 0.0398 0.0989 0.0189 0.0576 0.1361 0.1528 0.1542 0.1870 0.1607 0.1559 0.1784 0.1324 0.1349
##        AD   2.1369 0.8913 0.7635 0.4555 0.4078 0.3952 0.3604 0.3344 0.3148 0.2755 0.2544 0.2444 0.2218 0.2142
##        ADM  1.2229 0.1460 0.2877 0.0239 0.0703 0.1484 0.1434 0.1415 0.1527 0.1095 0.0967 0.1015 0.0739 0.0744
##        FOM  0.9780 0.4312 0.3710 0.1653 0.1634 0.1583 0.1494 0.1445 0.1386 0.1221 0.1126 0.1125 0.1078 0.1062
## agnes  APN  0.0000 0.0341 0.0980 0.0840 0.0622 0.0705 0.0938 0.1165 0.1442 0.1289 0.1138 0.1278 0.1352 0.1078
##        AD   1.5863 0.8784 0.7530 0.5940 0.4138 0.3811 0.3607 0.3372 0.3223 0.2946 0.2609 0.2476 0.2337 0.2179
##        ADM  0.0000 0.1225 0.2298 0.2078 0.0798 0.0863 0.0865 0.0928 0.1204 0.0979 0.0893 0.0925 0.0853 0.0676
##        FOM  0.6582 0.4180 0.3814 0.3425 0.1597 0.1547 0.1507 0.1493 0.1481 0.1412 0.1333 0.1259 0.1136 0.1082
## diana  APN  0.1006 0.0374 0.1215 0.0747 0.1135 0.1327 0.1309 0.1654 0.1623 0.1812 0.1711 0.1681 0.1687 0.1776
##        AD   1.7686 0.8856 0.7471 0.5737 0.4792 0.4023 0.3632 0.3439 0.3223 0.3095 0.2751 0.2487 0.2427 0.2351
##        ADM  0.4071 0.1360 0.2913 0.2150 0.1621 0.1291 0.1168 0.1219 0.1108 0.1442 0.1139 0.0849 0.0882 0.0912
##        FOM  0.8189 0.4257 0.3815 0.3462 0.2231 0.1594 0.1529 0.1500 0.1477 0.1459 0.1390 0.1221 0.1219 0.1191
## 
## Optimal Scores:
## 
##     Score  Method Clusters
## APN 0.0000 kmeans 2       
## AD  0.2139 kmeans 15      
## ADM 0.0000 kmeans 2       
## FOM 0.1061 kmeans 15
plot(stability.cl)

## UMAP and kmeans: cluster the 2-D embedding with k-means.
## NOTE(review): the validation above favoured k = 2-3 for kmeans on the UMAP
## data; k = 8 is used here — confirm this choice is intentional.
umap.compas.kmean = eclust(umap.df, 
       FUNcluster = "kmeans", 
       k = 8,
       seed = 888)

## Attach the k-means assignment (as a factor) to the sample for plotting
sample.scaled.compas.df = cbind(sample.scaled.compas.df, cluster = as.factor(umap.compas.kmean$cluster))

## Colour the 2-D embedding by k-means cluster
km_umap.plot = 
  ggplot(sample.scaled.compas.df, aes(umap1, umap2, colour = cluster)) + 
  geom_point(size = 1) + 
  labs(title = "Kmeans clustering based on UMAP transformed data", x = "", y = "") +
  theme_bw() + 
  theme(legend.position = "none") 

km_umap.plot

## HDBSCAN on the 4-component embedding. Hierarchical density-based clustering
## is very sensitive to minPts, the minimum number of points per cluster.
sample.scaled.compas.df$hdb.cluster <- as.factor(
  hdbscan(as.matrix(umap4.df), minPts = 15)$cluster
)

## Colour the 2-D embedding by HDBSCAN cluster label
hdbscan.plot <- ggplot(sample.scaled.compas.df,
                       aes(umap1, umap2, colour = hdb.cluster)) +
  geom_point(size = 1) +
  labs(title = "HDBscan clustering based on UMAP transformed data",
       x = "", y = "") +
  theme_bw() +
  theme(legend.position = "none")
hdbscan.plot

# There are 3 main clusters of data in the kmeans clustering graph. Two big clusters contain the majority of the points, while one cluster sits by itself and appears to be an outlier. This suggests that 3 clusters could work well for this data, but more data is required to truly understand the data and its relevance.